{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型量化\n",
    "\n",
    "\n",
    " 1. 使用 GPTQ 量化 OPT-6.7B 模型。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/AutoGPTQ_transformers.ipynb ）\n",
    "\n",
    " 2. 使用 AWQ 量化 Facebook OPT-6.7B 模型。Facebook OPT 模型地址： https://huggingface.co/facebook?search_models=opt\n",
    "\n",
    "    课程代码： https://github.com/DjangoPeng/LLM-quickstart/blob/main/transformers/AWQ_transformers.ipynb\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. GPTQ量化 opt-6.7b"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 使用默认的c4数据集进行量化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "torch.cuda.is_available()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "import os\n",
    "\n",
    "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
    "output = result.stdout\n",
    "for line in output.splitlines():\n",
    "    if '=' in line:\n",
    "        var, value = line.split('=', 1)\n",
    "        os.environ[var] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: http://mirrors.aliyun.com/pypi/simple\n",
      "Requirement already satisfied: datasets in ./miniconda3/lib/python3.10/site-packages (2.17.0)\n",
      "Requirement already satisfied: numpy>=1.17 in ./miniconda3/lib/python3.10/site-packages (from datasets) (1.26.3)\n",
      "Requirement already satisfied: pandas in ./miniconda3/lib/python3.10/site-packages (from datasets) (2.2.0)\n",
      "Requirement already satisfied: pyarrow>=12.0.0 in ./miniconda3/lib/python3.10/site-packages (from datasets) (15.0.0)\n",
      "Requirement already satisfied: requests>=2.19.0 in ./miniconda3/lib/python3.10/site-packages (from datasets) (2.31.0)\n",
      "Requirement already satisfied: multiprocess in ./miniconda3/lib/python3.10/site-packages (from datasets) (0.70.16)\n",
      "Requirement already satisfied: pyyaml>=5.1 in ./miniconda3/lib/python3.10/site-packages (from datasets) (6.0.1)\n",
      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in ./miniconda3/lib/python3.10/site-packages (from datasets) (0.3.8)\n",
      "Requirement already satisfied: filelock in ./miniconda3/lib/python3.10/site-packages (from datasets) (3.13.1)\n",
      "Requirement already satisfied: xxhash in ./miniconda3/lib/python3.10/site-packages (from datasets) (3.4.1)\n",
      "Requirement already satisfied: packaging in ./miniconda3/lib/python3.10/site-packages (from datasets) (23.2)\n",
      "Requirement already satisfied: aiohttp in ./miniconda3/lib/python3.10/site-packages (from datasets) (3.9.3)\n",
      "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in ./miniconda3/lib/python3.10/site-packages (from datasets) (2023.10.0)\n",
      "Requirement already satisfied: huggingface-hub>=0.19.4 in ./miniconda3/lib/python3.10/site-packages (from datasets) (0.20.3)\n",
      "Requirement already satisfied: tqdm>=4.62.1 in ./miniconda3/lib/python3.10/site-packages (from datasets) (4.64.1)\n",
      "Requirement already satisfied: pyarrow-hotfix in ./miniconda3/lib/python3.10/site-packages (from datasets) (0.6)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0 in ./miniconda3/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.3)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in ./miniconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.1)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in ./miniconda3/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.5)\n",
      "Requirement already satisfied: attrs>=17.3.0 in ./miniconda3/lib/python3.10/site-packages (from aiohttp->datasets) (23.2.0)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in ./miniconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.9.4)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in ./miniconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.4.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./miniconda3/lib/python3.10/site-packages (from huggingface-hub>=0.19.4->datasets) (4.9.0)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./miniconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./miniconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.13)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in ./miniconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2022.12.7)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in ./miniconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2.0.4)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in ./miniconda3/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n",
      "Requirement already satisfied: tzdata>=2022.7 in ./miniconda3/lib/python3.10/site-packages (from pandas->datasets) (2024.1)\n",
      "Requirement already satisfied: pytz>=2020.1 in ./miniconda3/lib/python3.10/site-packages (from pandas->datasets) (2024.1)\n",
      "Requirement already satisfied: six>=1.5 in ./miniconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "!pip install -U datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fc7dd59e3c9f490f96966806f4c32556",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a1cfb368bea3419caf64ad84d0c6acba",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2edf4cc3b0564e7d8d37718bab19c505",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ca86ae74524b47e2aac1262d63729809",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6cdf40a04d4545e2953b020ff0842556",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "de5023be8f914bdea1d8f5158cb431fc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0f9162ed5c2a44889d4d031bb8c0b6c8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6698811fc3f84b148dc81914304390bd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "10c2a734cb524b7da017e08de78088ce",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing model.decoder.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:optimum.gptq.quantizer:Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig\n",
    "import torch\n",
    "\n",
    "model_id = \"/root/autodl-tmp/models/opt-6.7b\"\n",
    "\n",
    "quantization_config = GPTQConfig(\n",
    "     bits=4, # 量化精度\n",
    "     group_size=128,\n",
    "     dataset=\"wikitext2\",\n",
    "     desc_act=False,\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "quant_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map='auto')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 检查量化模型正确性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'training': True,\n",
       " '_parameters': OrderedDict(),\n",
       " '_buffers': OrderedDict([('qweight',\n",
       "               tensor([[-1516590485,  2022282410, -1182414405,  ..., -1537760937,\n",
       "                          968190105, -1218869078],\n",
       "                       [-1973762678, -1990888091,  1451669112,  ...,  1767327094,\n",
       "                         1485273242, -1769109111],\n",
       "                       [ -890521657,  1705355194,  2042256023,  ...,  1401453177,\n",
       "                         -963081656, -1212573545],\n",
       "                       ...,\n",
       "                       [-1192544404,   713964148,  1432856439,  ...,  1967750874,\n",
       "                        -1214732472, -1787328359],\n",
       "                       [-1485273703, -1736799852,  1776327546,  ..., -1757500853,\n",
       "                        -1936219177, -1986435164],\n",
       "                       [ 1817626772,  1151063749, -1735997610,  ..., -1334205594,\n",
       "                        -1450811000, -1199921061]], device='cuda:0', dtype=torch.int32)),\n",
       "              ('qzeros',\n",
       "               tensor([[2004318071, 2004318071, 2004318071,  ..., 2004318071, 2004318071,\n",
       "                        2004318071],\n",
       "                       [2004318071, 2004318071, 2004318071,  ..., 2004318071, 2004318071,\n",
       "                        2004318071],\n",
       "                       [2004318071, 2004318071, 2004318071,  ..., 2004318071, 2004318071,\n",
       "                        2004318071],\n",
       "                       ...,\n",
       "                       [2004318071, 2004318071, 2004318071,  ..., 2004318071, 2004318071,\n",
       "                        2004318071],\n",
       "                       [2004318071, 2004318071, 2004318071,  ..., 2004318071, 2004318071,\n",
       "                        2004318071],\n",
       "                       [2004318071, 2004318071, 2004318071,  ..., 2004318071, 2004318071,\n",
       "                        2004318071]], device='cuda:0', dtype=torch.int32)),\n",
       "              ('scales',\n",
       "               tensor([[0.0039, 0.0035, 0.0035,  ..., 0.0030, 0.0028, 0.0044],\n",
       "                       [0.0032, 0.0030, 0.0036,  ..., 0.0034, 0.0028, 0.0033],\n",
       "                       [0.0038, 0.0029, 0.0039,  ..., 0.0043, 0.0024, 0.0035],\n",
       "                       ...,\n",
       "                       [0.0034, 0.0034, 0.0034,  ..., 0.0033, 0.0038, 0.0030],\n",
       "                       [0.0030, 0.0029, 0.0030,  ..., 0.0031, 0.0038, 0.0027],\n",
       "                       [0.0034, 0.0035, 0.0032,  ..., 0.0026, 0.0027, 0.0041]],\n",
       "                      device='cuda:0', dtype=torch.float16)),\n",
       "              ('g_idx',\n",
       "               tensor([ 0,  0,  0,  ..., 31, 31, 31], device='cuda:0', dtype=torch.int32)),\n",
       "              ('bias',\n",
       "               tensor([ 0.0061,  0.0129, -0.0215,  ...,  0.0157, -0.0144,  0.0106],\n",
       "                      device='cuda:0', dtype=torch.float16))]),\n",
       " '_non_persistent_buffers_set': set(),\n",
       " '_backward_pre_hooks': OrderedDict(),\n",
       " '_backward_hooks': OrderedDict(),\n",
       " '_is_full_backward_hook': None,\n",
       " '_forward_hooks': OrderedDict(),\n",
       " '_forward_hooks_with_kwargs': OrderedDict(),\n",
       " '_forward_hooks_always_called': OrderedDict(),\n",
       " '_forward_pre_hooks': OrderedDict(),\n",
       " '_forward_pre_hooks_with_kwargs': OrderedDict(),\n",
       " '_state_dict_hooks': OrderedDict(),\n",
       " '_state_dict_pre_hooks': OrderedDict(),\n",
       " '_load_state_dict_pre_hooks': OrderedDict(),\n",
       " '_load_state_dict_post_hooks': OrderedDict(),\n",
       " '_modules': OrderedDict(),\n",
       " 'infeatures': 4096,\n",
       " 'outfeatures': 4096,\n",
       " 'bits': 4,\n",
       " 'group_size': 128,\n",
       " 'maxq': 15,\n",
       " 'half_indim': 2048,\n",
       " 'use_cuda_fp16': True,\n",
       " 'wf': tensor([[ 0,  4,  8, 12, 16, 20, 24, 28]], dtype=torch.int32),\n",
       " 'kernel_switch_threshold': 128,\n",
       " 'autogptq_cuda_available': True,\n",
       " 'autogptq_cuda': <module 'autogptq_cuda_256' from '/root/miniconda3/lib/python3.10/site-packages/autogptq_cuda_256.cpython-310-x86_64-linux-gnu.so'>,\n",
       " 'trainable': False,\n",
       " 'device': device(type='cuda', index=0)}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "quant_model.model.decoder.layers[0].self_attn.q_proj.__dict__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 加载模型并生成文本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merry Christmas! I'm glad to see you're still around.\n",
      "I'm still around, just not as much. I'm still here, just not as much.\n"
     ]
    }
   ],
   "source": [
    "text = \"Merry Christmas! I'm glad to\"\n",
    "inputs = tokenizer(text, return_tensors=\"pt\").to(0)\n",
    "\n",
    "out = quant_model.generate(**inputs, max_new_tokens=64)\n",
    "print(tokenizer.decode(out[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. 重新量化并保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4491224114ba434b9361a1ce39a70ca7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/miniconda3/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
      "  return self.fget.__get__(instance, owner)()\n",
      "Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub\n",
      "Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Sat Feb 17 01:21:51 2024).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "15928f6cf6384cc9a702108e739a003b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing model.decoder.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig\n",
    "import torch\n",
    "\n",
    "model_id = \"/root/autodl-tmp/models/opt-6.7b\"\n",
    "\n",
    "quantization_config = GPTQConfig(\n",
    "     bits=4, # 量化精度\n",
    "     group_size=128,\n",
    "     dataset=\"wikitext2\",\n",
    "     desc_act=False,\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "quant_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map='auto')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "OPTForCausalLM(\n",
       "  (model): OPTModel(\n",
       "    (decoder): OPTDecoder(\n",
       "      (embed_tokens): Embedding(50272, 4096, padding_idx=1)\n",
       "      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)\n",
       "      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
       "      (layers): ModuleList(\n",
       "        (0-31): 32 x OPTDecoderLayer(\n",
       "          (self_attn): OPTAttention(\n",
       "            (k_proj): QuantLinear()\n",
       "            (out_proj): QuantLinear()\n",
       "            (q_proj): QuantLinear()\n",
       "            (v_proj): QuantLinear()\n",
       "          )\n",
       "          (activation_fn): ReLU()\n",
       "          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
       "          (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): QuantLinear()\n",
       "          (fc2): QuantLinear()\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  )\n",
       "  (lm_head): Linear(in_features=4096, out_features=50272, bias=False)\n",
       ")"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "quant_model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'OPTForCausalLM' object has no attribute 'save_quantized'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m quant_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels/opt-6.7b-gptq\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mquant_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_quantized\u001b[49m(quant_path)\n\u001b[1;32m      3\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39msave_pretrained(quant_path)  \u001b[38;5;66;03m# 保存分词器\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1695\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   1693\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m   1694\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1695\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'OPTForCausalLM' object has no attribute 'save_quantized'"
     ]
    }
   ],
   "source": [
    "quant_path = \"models/opt-6.7b-gptq\"\n",
    "quant_model.save_quantized(quant_path)\n",
    "tokenizer.save_pretrained(quant_path)  # 保存分词器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('models/opt-6.7b-gptq/tokenizer_config.json',\n",
       " 'models/opt-6.7b-gptq/special_tokens_map.json',\n",
       " 'models/opt-6.7b-gptq/vocab.json',\n",
       " 'models/opt-6.7b-gptq/merges.txt',\n",
       " 'models/opt-6.7b-gptq/added_tokens.json',\n",
       " 'models/opt-6.7b-gptq/tokenizer.json')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "quant_path = \"models/opt-6.7b-gptq\"\n",
    "quant_model.save_pretrained(quant_path)\n",
    "tokenizer.save_pretrained(quant_path)  # 保存分词器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
